PART 1¶

Select¶

In [1]:
import pandas as pd
import plotly.express as px

# Load the dataset
df = pd.read_csv("netflix_titles.csv")

# Initialize the "is_interesting" column
df["is_interesting"] = False

# `duration` is stored as text (a later cell parses it with `.str.extract`),
# so plotting it directly yields a categorical y-axis. Plot the leading number
# instead and keep the raw text available in the hover box.
df["duration_value"] = df["duration"].str.extract(r"(\d+)", expand=False).astype(float)

# Create a scatter plot
fig = px.scatter(df, x='release_year', y='duration_value', color='type',
                 title='Netflix Movies and TV Shows',
                 labels={'release_year': 'Release Year', 'duration_value': 'Duration (minutes)'},
                 hover_data={'title': True, 'duration': True})

# Display the plot
fig.show()

Explore¶

In [2]:
# Load the dataset and chart how often each content rating occurs
import pandas as pd

# Read the Netflix catalogue into a DataFrame
netflix_df = pd.read_csv('netflix_titles.csv')

# Bar chart of how many titles carry each rating
rating_freq = netflix_df['rating'].value_counts()
ax = rating_freq.plot.bar(color='skyblue')
ax.set_ylabel('Frequency', fontsize=12)
ax.set_xlabel('Rating', fontsize=12)
ax.set_title('Netflix Content by Rating', color='black', fontsize=15)

# Write each bar's count slightly above its top edge
for p in ax.patches:
    bar_top = p.get_height()
    bar_mid = p.get_x() + p.get_width() / 2.
    ax.annotate(
        f'{bar_top:.0f}',
        (bar_mid, bar_top),
        xytext=(0, 10),
        textcoords='offset points',
        ha='center',
        va='center',
    )

Reconfigure¶

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Read the catalogue
netflix_df = pd.read_csv('netflix_titles.csv')

# One row per (title, country): split the comma-separated country field and
# explode it, then count titles per country and release year
netflix_df['country'] = netflix_df['country'].str.split(', ')
country_data = netflix_df.explode('country')
country_release_counts = (
    country_data
    .groupby(['country', 'release_year'])
    .size()
    .reset_index(name='counts')
)

# Keep only the ten countries with the largest overall title count
top_countries = (
    country_release_counts
    .groupby('country')['counts']
    .sum()
    .nlargest(10)
    .index
)
top_country_data = country_release_counts.loc[
    country_release_counts['country'].isin(top_countries)
]

# Output widget that the heatmap is rendered into
plot_output = widgets.Output()

# Main function to create heatmap
def create_heatmap(order_by='total_count'):
    """Render the country x release-year heatmap into ``plot_output``.

    Parameters
    ----------
    order_by : str
        ``'country'`` orders the rows alphabetically; ``'total_count'``
        (the default) orders them by total title count, largest first.

    Raises
    ------
    ValueError
        If ``order_by`` is not one of the two supported keys.
    """
    # Validate up front so a bad key fails with a clear message instead of an
    # unbound ``country_order`` further down.
    if order_by not in ('country', 'total_count'):
        raise ValueError(
            f"order_by must be 'country' or 'total_count', got {order_by!r}"
        )

    if order_by == 'country':
        country_order = sorted(top_country_data['country'].unique())
    else:
        country_order = (
            top_country_data
            .groupby('country')['counts']
            .sum()
            .sort_values(ascending=False)
            .index
        )

    # Keyword arguments are required here: pandas 2.x no longer accepts
    # index/columns/values positionally.
    ordered_pivot = (
        top_country_data
        .pivot(index="country", columns="release_year", values="counts")
        .reindex(country_order)
        .fillna(0)       # years with no titles for a country
        .astype(int)     # integer cells so fmt="d" annotations work
    )

    with plot_output:
        plot_output.clear_output(wait=True)  # Clear the previous plot
        plt.figure(figsize=(20, 8))
        sns.heatmap(ordered_pivot, cmap="YlGnBu", linewidths=.5, annot=True, fmt="d")
        plt.title('Number of Titles Released per Year for Top 10 Countries on Netflix')
        plt.ylabel('Country')
        plt.xlabel('Release Year')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

# Dropdown that selects the row ordering of the heatmap
order_by_widget = widgets.Dropdown(
    description='Order by:',
    options=['Country', 'Total Count'],
    value='Total Count',
    disabled=False,
)


def on_order_change(change):
    """Redraw the heatmap whenever the dropdown value changes.

    The selected label is lower-cased and its spaces replaced by underscores
    before being passed to ``create_heatmap`` as ``order_by``.
    """
    order_key = change.new.lower().replace(' ', '_')
    create_heatmap(order_by=order_key)


# Call the handler on every change of the dropdown's value
order_by_widget.observe(on_order_change, names='value')

# Show the control and the plot area, then draw the first heatmap
display(order_by_widget, plot_output)
create_heatmap()
Dropdown(description='Order by:', index=1, options=('Country', 'Total Count'), value='Total Count')
Output()

Encode¶

In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from collections import Counter

# Read the catalogue and count how many titles list each country
df = pd.read_csv('netflix_titles.csv')
country_counts = Counter(
    country.strip()
    for sublist in df['country'].dropna().str.split(', ')
    for country in sublist
)
country_counts_df = pd.DataFrame(
    list(country_counts.items()),
    columns=['Country', 'Count'],
)

# Figure with two side-by-side geo panels, one per colour scale
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "choropleth"}, {"type": "choropleth"}]],
    subplot_titles=("Electric Color Scale", "Picnic Color Scale"),
    horizontal_spacing=0.02,
)

# Same data in both panels; only the colour scale and the colour bar's
# horizontal position differ
for panel_col, (scale_name, colorbar_x) in enumerate([('Electric', 0.46), ('Picnic', 1)], start=1):
    fig.add_trace(
        go.Choropleth(
            locations=country_counts_df['Country'],
            z=country_counts_df['Count'],
            locationmode='country names',
            colorscale=scale_name,
            colorbar=dict(len=0.45, x=colorbar_x, title="Count"),
        ),
        1, panel_col,
    )

# Title and overall height
fig.update_layout(
    title_text="Number of Netflix Titles by Country with Different Color Scales",
    height=400,
)

# Render the figure
fig.show()

Abstract/Elaborate¶

In [5]:
# Abstract / Elaborate
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from mpl_toolkits.mplot3d import Axes3D


# Load the dataset
df = pd.read_csv("netflix_titles.csv")


# Preprocess duration data: keep the leading run of digits and convert to float.
# The pattern is a raw string so `\d` is not treated as an (invalid) string
# escape; expand=False returns a Series, which is what a column assignment wants.
df['duration'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)


# 3D scatter of a random subset of titles
def plot_scatter_3d(granularity):
    """Plot a random fraction of ``df`` as a 3D scatter.

    ``granularity`` is the fraction of rows to sample. The x-axis is the
    release year, the y-axis the running position within the sample, and the
    z-axis the duration value.
    """
    sample_size = int(len(df) * granularity)
    subset = df.sample(sample_size)

    figure = plt.figure(figsize=(10, 6))
    axes = figure.add_subplot(111, projection='3d')

    # y is simply 0..sample_size-1, one slot per sampled row
    axes.scatter(subset['release_year'], range(len(subset)), subset['duration'])

    axes.set_title('3D Scatter Plot with Adjustable Granularity')
    axes.set_xlabel('Release Year')
    axes.set_ylabel('Index of Netflix Titles')
    axes.set_zlabel('Duration (minutes)')

    plt.grid(True)
    plt.show()


from IPython.display import display

# Create slider for adjusting granularity
granularity_slider = widgets.FloatSlider(min=0.1, max=1, step=0.1, value=0.5, description='Granularity:')

# More code follows in this cell, so the interactive control is not the cell's
# last expression and would not render by itself; display it explicitly.
display(widgets.interactive(plot_scatter_3d, granularity=granularity_slider))



import pandas as pd
import plotly.express as px


# Load the Netflix dataset
netflix_data = pd.read_csv('netflix_titles.csv')

# The raw `duration` column is text, which would give a categorical y-axis and
# break the numeric arithmetic `update_zoom` performs on y. Keep the original
# text for the hover box and plot the leading number instead.
netflix_data['duration_text'] = netflix_data['duration']
netflix_data['duration'] = (
    netflix_data['duration'].str.extract(r'(\d+)', expand=False).astype(float)
)


# Create a scatter plot
sc1 = px.scatter(
    netflix_data,
    x='release_year',
    y='duration',
    hover_data=['title', 'rating', 'listed_in', 'duration_text'],
)


def update_zoom(trace, points, selector):
    """Zoom ``sc1`` onto the first selected point.

    The axes are set to a window of +/-1 around the point's x value and +/-10
    around its y value, with autorange switched off. Does nothing when the
    selection is empty.

    Parameters
    ----------
    trace, selector :
        Unused here; accepted so the function has the three-argument
        (trace, points, selector) callback shape.
    points :
        Object carrying the selected coordinates. plotly's callback ``Points``
        exposes them as ``xs`` / ``ys``; ``x`` / ``y`` are accepted as a
        fallback for objects shaped the way this function originally read them.
    """
    xs = getattr(points, 'xs', None)
    if xs is None:
        xs = getattr(points, 'x', ())
    ys = getattr(points, 'ys', None)
    if ys is None:
        ys = getattr(points, 'y', ())

    # Nothing selected: indexing [0] would raise, so leave the view alone.
    if len(xs) == 0 or len(ys) == 0:
        return

    # NOTE(review): the arithmetic below assumes numeric coordinates.
    x = xs[0]
    y = ys[0]
    sc1.update_layout(
        xaxis=dict(
            range=[x - 1, x + 1],
            type='linear',
            autorange=False
        ),
        yaxis=dict(
            range=[y - 10, y + 10],
            type='linear',
            autorange=False
        )
    )


# Add a single "Reset Zoom" button that turns autorange back on for both axes
sc1.update_layout(
    updatemenus=[
        {
            'type': 'buttons',
            'showactive': False,
            'buttons': [
                {
                    'label': 'Reset Zoom',
                    'method': 'update',
                    # first element: trace changes (none); second: layout changes
                    'args': [
                        {},
                        {'xaxis.autorange': True, 'yaxis.autorange': True},
                    ],
                },
            ],
        },
    ]
)


# Render the figure
sc1.show()

Filter¶

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset
df = pd.read_csv("netflix_titles.csv")


# Function to filter and plot data based on selected genre
def plot_by_genre(genre):
    """Plot a histogram of release years for titles matching ``genre``.

    Parameters
    ----------
    genre : str
        ``'All'`` keeps every row; any other value keeps rows whose
        ``listed_in`` text contains it (case-insensitive, missing values
        excluded).
    """
    if genre == 'All':
        filtered_df = df
    else:
        # regex=False: the label is matched as literal text, so characters that
        # are special in regular expressions cannot change the match or raise.
        filtered_df = df[df['listed_in'].str.contains(genre, case=False, na=False, regex=False)]

    plt.figure(figsize=(10, 6))
    plt.hist(filtered_df['release_year'], bins=20, color='skyblue', edgecolor='black')
    plt.xlabel('Release Year')
    plt.ylabel('Frequency')
    plt.title(f'Distribution of Release Year for Genre: {genre}')
    plt.grid(True)
    plt.show()


# Create dropdown menu for selecting genre.
# dropna() keeps a missing `listed_in` value from becoming a dropdown option.
genre_dropdown = widgets.Dropdown(
    options=['All'] + df['listed_in'].dropna().unique().tolist(),
    description='Genre:',
)

# A separate `widgets.interactive(plot_by_genre, genre=genre_dropdown)` is
# deliberately not built here: created mid-cell it is never displayed, yet it
# would still attach an observer to `genre_dropdown`. The combined genre + year
# control at the end of the cell is the one that is shown.


# Create a slider to adjust the year range (bounds computed once, as plain ints)
year_min = int(df['release_year'].min())
year_max = int(df['release_year'].max())
year_range = widgets.IntRangeSlider(
    min=year_min,
    max=year_max,
    value=[year_min, year_max],
    description='Year Range:',
)


# Update the plot based on year range and genre selection
def update_plot(genre, year_range):
    """Plot a histogram of release years filtered by year range and genre.

    Parameters
    ----------
    genre : str
        ``'All'`` applies no genre filter; any other value keeps rows whose
        ``listed_in`` text contains it (case-insensitive, missing values
        excluded).
    year_range : sequence of two numbers
        Inclusive (lowest, highest) release year to keep.
    """
    filtered_df = df[(df['release_year'] >= year_range[0]) & (df['release_year'] <= year_range[1])]
    if genre != 'All':
        # regex=False: match the label as literal text rather than as a pattern.
        filtered_df = filtered_df[
            filtered_df['listed_in'].str.contains(genre, case=False, na=False, regex=False)
        ]

    plt.figure(figsize=(10, 6))
    plt.hist(filtered_df['release_year'], bins=20, color='skyblue', edgecolor='black')
    plt.xlabel('Release Year')
    plt.ylabel('Frequency')
    plt.title(f'Distribution of Release Year for Genre: {genre}')
    plt.grid(True)
    plt.show()


# Connect the widgets to the update function
widgets.interactive(update_plot, genre=genre_dropdown, year_range=year_range)
interactive(children=(Dropdown(description='Genre:', options=('All', 'Documentaries', 'International TV Shows,…

Connect¶

In [7]:
import pandas as pd
import plotly.express as px


# Load the dataset
netflix_data = pd.read_csv('netflix_titles.csv')


# Convert 'release_year' to numeric; values that cannot be parsed become NaN
netflix_data['release_year'] = pd.to_numeric(netflix_data['release_year'], errors='coerce')


# Drop any rows with NaN in 'release_year' after conversion.
# .copy() makes this an independent frame, so the column assignments below do
# not trigger pandas' SettingWithCopyWarning.
netflix_data_clean = netflix_data.dropna(subset=['release_year']).copy()


# Binning 'release_year' into decades: edges 1900, 1910, ..., 2030 and one
# label per left edge ('1900s' ... '2020s'). right=False makes each bin
# [start, start + 10), so e.g. 2010 falls in '2010s'.
bins = list(range(1900, 2031, 10))
labels = [f'{decade_start}s' for decade_start in bins[:-1]]
netflix_data_clean['decade'] = pd.cut(netflix_data_clean['release_year'], bins=bins, labels=labels, right=False)


# Ensure 'type' and 'rating' are treated as categorical
netflix_data_clean['type'] = netflix_data_clean['type'].astype('category')
netflix_data_clean['rating'] = netflix_data_clean['rating'].astype('category')


# Prepare the data for the parallel categories plot: keep only the three
# plotted columns and drop rows missing any of them
data_for_visualization = netflix_data_clean[['type', 'decade', 'rating']].dropna()


# Build the parallel categories plot linking type, decade and rating
fig = px.parallel_categories(
    data_for_visualization,
    dimensions=['type', 'decade', 'rating'],
    labels={'type': 'Content Type', 'decade': 'Release Decade', 'rating': 'Rating'},
    color_continuous_scale=px.colors.sequential.Inferno,
)


# Render the figure
fig.show()
In [8]:
# Import necessary libraries
import pandas as pd
import plotly.express as px

# Load the Netflix dataset
netflix_data = pd.read_csv('netflix_titles.csv')

# A polar line chart needs a numeric radius. Passing the raw rows with
# r='rating' puts a text column on the radial axis and draws one vertex per
# title. Aggregate to the number of titles per (type, rating) first.
rating_counts = (
    netflix_data
    .dropna(subset=['type', 'rating'])
    .groupby(['type', 'rating'])
    .size()
    .reset_index(name='count')
)

# Create a polar line chart: one closed line per content type, one spoke per rating
fig3 = px.line_polar(
    rating_counts,
    r='count',
    theta='rating',
    color='type',
    line_close=True,
    title='Number of Titles per Rating by Content Type',
)

# Show the polar line chart
fig3.show()